check max size of utf8proc_decompose_char buffer (#291)

author Steven G. Johnson <stevenj@alum.mit.edu>

Fri, 20 Jun 2025 20:10:14 +0000 (16:10 -0400)

committer GitHub <noreply@github.com>

Fri, 20 Jun 2025 20:10:14 +0000 (16:10 -0400)
author Steven G. Johnson <stevenj@alum.mit.edu>
Fri, 20 Jun 2025 20:10:14 +0000 (16:10 -0400)
committer GitHub <noreply@github.com>
Fri, 20 Jun 2025 20:10:14 +0000 (16:10 -0400)
diff --git a/.gitignore b/.gitignore

index 3a866baf7a30a773a1ec926c005b5da57bd764c7..716378f91d9d999e693d2519738f7ecbd148255b 100644 (file)
--- a/.gitignore
+++ b/.gitignore
@@ -28,6 +28,7 @@
  /test/case
  /test/iscase
  /test/custom
+/test/maxdecomposition
  /tmp/
  /mingw_static/
  /mingw_shared/
diff --git a/CMakeLists.txt b/CMakeLists.txt

index 2f259fe63a595d83b09ec0651ca5d8c8b6884766..f07f90c57841912a9c3524e77b1f040e803145d4 100644 (file)
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -100,12 +100,15 @@ if(UTF8PROC_ENABLE_TESTING)
    target_link_libraries(printproperty utf8proc)
    add_executable(valid test/tests.h test/tests.c utf8proc.h test/valid.c)
    target_link_libraries(valid utf8proc)
+  add_executable(maxdecomposition test/tests.h test/tests.c utf8proc.h test/maxdecomposition.c)
+  target_link_libraries(maxdecomposition utf8proc)
    add_test(utf8proc.testcase case)
    add_test(utf8proc.testcustom custom)
    add_test(utf8proc.testiterate iterate)
    add_test(utf8proc.testmisc misc)
    add_test(utf8proc.testprintproperty printproperty)
    add_test(utf8proc.testvalid valid)
+  add_test(utf8proc.testmaxdecomposition maxdecomposition)
  
    if (NOT WIN32)
      # no wcwidth function on Windows
diff --git a/Makefile b/Makefile

index ffa4d4daa27d168be94cfca05283c35b12dd6f15..933d7c60c0a8dd2f82a2b621a078e0d79af8ce2a 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -59,7 +59,7 @@ clean:
  ifneq ($(OS),Darwin)
         rm -f libutf8proc.so.$(MAJOR)
  endif
-       rm -f test/tests.o test/normtest test/graphemetest test/printproperty test/charwidth test/valid test/iterate test/case test/custom test/misc test/iscase
+       rm -f test/tests.o test/normtest test/graphemetest test/printproperty test/charwidth test/valid test/iterate test/case test/custom test/misc test/iscase test/maxdecomposition
         rm -rf MANIFEST.new tmp
         $(MAKE) -C bench clean
         $(MAKE) -C data clean
@@ -171,6 +171,9 @@ test/custom: test/custom.c test/tests.o utf8proc.o utf8proc.h test/tests.h
  test/misc: test/misc.c test/tests.o utf8proc.o utf8proc.h test/tests.h
         $(CC) $(UCFLAGS) $(LDFLAGS) -DUNICODE_VERSION='"'`$(PERL) -ne "/^UNICODE_VERSION=/ and print $$';" data/Makefile`'"' test/misc.c test/tests.o utf8proc.o -o $@
  
+test/maxdecomposition: test/maxdecomposition.c test/tests.o utf8proc.o utf8proc.h test/tests.h
+       $(CC) $(UCFLAGS) $(LDFLAGS) -DUNICODE_VERSION='"'`$(PERL) -ne "/^UNICODE_VERSION=/ and print $$';" data/Makefile`'"' test/maxdecomposition.c test/tests.o utf8proc.o -o $@
+
  # make release tarball from master branch
  dist:
         git archive master --prefix=utf8proc-$(VERSION)/ -o utf8proc-$(VERSION).tar.gz
@@ -186,7 +189,7 @@ distcheck: dist
         make -C utf8proc-$(VERSION) check
         rm -rf utf8proc-$(VERSION)
  
-check: test/normtest data/NormalizationTest.txt data/Lowercase.txt data/Uppercase.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/iscase test/custom test/charwidth test/misc test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o
+check: test/normtest data/NormalizationTest.txt data/Lowercase.txt data/Uppercase.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/iscase test/custom test/charwidth test/misc test/maxdecomposition test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o
         $(MAKE) -C bench
         test/normtest data/NormalizationTest.txt
         test/graphemetest data/GraphemeBreakTest.txt
@@ -197,3 +200,4 @@ check: test/normtest data/NormalizationTest.txt data/Lowercase.txt data/Uppercas
         test/case
         test/iscase data/Lowercase.txt data/Uppercase.txt
         test/custom
+       test/maxdecomposition
diff --git a/test/maxdecomposition.c b/test/maxdecomposition.c

new file mode 100644 (file)

index 0000000..699b148
--- /dev/null
+++ b/test/maxdecomposition.c
@@ -0,0 +1,22 @@
+#include "tests.h"
+
+/* Check the maximum decomposed size returned by utf8proc_decompose_char with UTF8PROC_DECOMPOSE,
+   in order to give a hint in the documentation.  The hint will need to be updated if this changes. */
+
+int main(void)
+{
+    utf8proc_int32_t dst[128];
+    utf8proc_ssize_t maxsize = 0, expected_maxsize = 4;
+    int success;
+
+    for (utf8proc_int32_t c = 0; c <= 0x110000; ++c) {
+        utf8proc_ssize_t sz = utf8proc_decompose_char(c, dst, 128, UTF8PROC_DECOMPOSE, NULL);
+        maxsize = sz > maxsize ? sz : maxsize;
+    }
+
+    success = expected_maxsize == maxsize;
+    fprintf(success ? stdout : stderr,
+            "%s: maximum decomposed size = %d chars\n",
+            success ? "SUCCEEDED" : "FAILED", (int) maxsize);
+    return !success;
+}
diff --git a/utf8proc.h b/utf8proc.h

index 7339f2e975e6fd05bb5bb593ceaf2fc9d055f4ed..c83dc585a00f0086aea1b9f0edaa3bedfa2b7388 100644 (file)
--- a/utf8proc.h
+++ b/utf8proc.h
@@ -517,7 +517,7 @@ UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int
   * @param dst the destination buffer.
   * @param bufsize the size of the destination buffer.
   * @param options one or more of the following flags:
- * - @ref UTF8PROC_REJECTNA  - return an error `codepoint` is unassigned
+ * - @ref UTF8PROC_REJECTNA  - return an error if `codepoint` is unassigned
   * - @ref UTF8PROC_IGNORE    - strip "default ignorable" codepoints
   * - @ref UTF8PROC_CASEFOLD  - apply Unicode casefolding
   * - @ref UTF8PROC_COMPAT    - replace certain codepoints with their
@@ -532,6 +532,11 @@ UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int
   * option is used.  If the string is being processed in order, this can be initialized to 0 for
   * the beginning of the string, and is thereafter updated automatically.  Otherwise, this parameter is ignored.
   *
+ * In the current version of utf8proc, the largest destination buffer size needed with the @ref UTF8PROC_DECOMPOSE
+ * option is 4 elements (or double that with @ref UTF8PROC_CHARBOUND), so this is a good default size.
+ * However, this may increase in future Unicode versions, so you should always check the return value
+ * as described below.
+ *
   * @return
   * In case of success, the number of codepoints written is returned; in case
   * of an error, a negative error code is returned (utf8proc_errmsg()).
author	Steven G. Johnson <stevenj@alum.mit.edu>
	Fri, 20 Jun 2025 20:10:14 +0000 (16:10 -0400)
committer	GitHub <noreply@github.com>
	Fri, 20 Jun 2025 20:10:14 +0000 (16:10 -0400)
.gitignore		patch \| blob \| history
CMakeLists.txt		patch \| blob \| history
Makefile		patch \| blob \| history
test/maxdecomposition.c	[new file with mode: 0644]	patch \| blob
utf8proc.h		patch \| blob \| history